# Attach the text-analysis packages used below. `library()` is used rather
# than `require()` so that a missing package stops the script immediately
# instead of surfacing later as an unrelated "could not find function" error.
library(quanteda)
library(newsmap)
library(LSX)

# Tokenised corpus that both models below are built from.
toks <- readRDS("data/tokens_nytimes_summary.RDS")

# Newsmap --------------------

# Seed dictionary used to label documents for the Newsmap classifier.
dict_newsmap <- dictionary(file = "historical.yml")

# Keep only tokens that start with an upper-case letter (case-sensitive
# match); both the label matrix and the feature matrix are built from these.
toks_cap <- tokens_select(toks, "^[A-Z][A-Za-z1-2]+", valuetype = "regex",
                          case_insensitive = FALSE)

# Plain nested calls are used instead of `%>%`: none of the packages attached
# by this script is guaranteed to provide the pipe operator (quanteda >= 3.0
# no longer re-exports it), so relying on it can abort the script.

# Label matrix: dictionary matches at the third level of the dictionary.
mt_dict <- dfm(
  tokens_lookup(toks_cap, dict_newsmap, levels = 3,
                nested_scope = "dictionary")
)

# Feature matrix: case-preserved capitalised tokens occurring at least
# 100 times in the corpus.
mt_feat <- dfm_trim(dfm(toks_cap, tolower = FALSE), min_termfreq = 100)

# Fit the classifier, persist it, and predict a class plus a confidence
# score for every document.
newsmap <- textmodel_newsmap(mt_feat, mt_dict)
saveRDS(newsmap, "newsmap.RDS")
pred_newsmap <- as.data.frame(predict(newsmap, confidence.fit = TRUE))

# LSS -------------------------

# Dictionary holding the seed words for the LSS model.
dict_lss <- dictionary(file = "keywords.yml")

# Document-feature matrix for LSS: drop tokens that start with an upper-case
# letter (case-sensitive match), build the dfm, and keep features occurring
# at least 10 times. Written as nested calls rather than with `%>%`, because
# no package attached by this script is guaranteed to provide the pipe
# (quanteda >= 3.0 no longer re-exports it).
mt <- dfm_trim(
  dfm(
    tokens_remove(toks, "^[A-Z]", valuetype = "regex",
                  case_insensitive = FALSE)
  ),
  min_termfreq = 10
)

# Fit the LSS model on the seed words, persist it, and score every document
# in `mt`.
seed <- as.seedwords(dict_lss$seedwords)
lss <- textmodel_lss(mt, seed, cache = TRUE, weight = "logcount")
saveRDS(lss, "lss.RDS")
pred_lss <- predict(lss, rescaling = TRUE, newdata = mt)

# Combine -------------------

# Combine both models' predictions into one row per document. The columns of
# `pred_newsmap` are prefixed with "country." by `data.frame()`, giving
# `country.class` and `country.confidence.fit`.
dat <- data.frame(docid = docid(mt),
                  country = pred_newsmap,
                  lss = pred_lss,
                  year = docvars(mt, "year"))

# `country.class` is a factor when it comes from the Newsmap prediction.
# Assigning a value that is not an existing level would silently produce NA
# (with only a warning), so make sure "us" is a level before using it as the
# fallback label. This is a no-op when "us" is already present.
if (is.factor(dat$country.class)) {
  levels(dat$country.class) <- union(levels(dat$country.class), "us")
}

# Documents whose Newsmap confidence is at or below 1 fall back to "us".
# NOTE(review): presumably because the corpus comes from a US newspaper;
# confirm the threshold and the rationale.
dat$country.class[dat$country.confidence.fit <= 1] <- "us"

# Flag documents with a positive LSS score.
dat$threat <- dat$lss > 0

saveRDS(dat, "class.RDS")
